library(Seurat)
library(Matrix)
library(ggplot2)
library(cowplot)
library(rafalib)
library(clustree)
library(venn)
library(dplyr)


if(!require(DoubletFinder)){
  remotes::install_github("chris-mcginnis-ucsf/DoubletFinder", upgrade = F)
  library(DoubletFinder)
}


# The cell cycle gene lists were generated as in the commented chunk. To avoid changes related to future versions of Seurat or BioMart, the lists have been hardcoded below, as generated June 2021.


# convertHumanGeneList <- function(x){
#   require("biomaRt")
#   human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
#   mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
#   
#   genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
#   
#   humanx <- unique(genesV2[, 2])
#   
#   # Print the first 6 genes found to the screen
#   #print(head(humanx))
#   return(humanx)
# }
# g2mFeat = convertHumanGeneList(cc.genes$g2m.genes)
# sFeat = convertHumanGeneList(cc.genes$s.genes)




g2mFeat = c("Smc4","Dlgap5","Ctcf","Cdc25c","Gtse1","Kif20b","Ttk","Tacc3","Top2a","Tpx2","Cdca3","Ncapd2",
"Cks2","Anp32e","G2e3","Lbr","Cdca2","Cks1brt","Ckap2","Hmgb2","Kif11","Nek2","Cenpe","Hjurp",
"Ect2","Aurkb","Cks1b","Kif23","Nuf2","Hmmr","Cdca8","Psrc1","Anln","Cdc20","Birc5","Ndc80",
"Rangap1","Ckap5","Kif2c","Cenpf","Nusap1","Cenpa","Aurka","Ube2c","Ckap2l","Mki67","Tubb4b","Bub1",
"Ccnb2")

sFeat = c("Msh2","Exo1","Mcm4","Rrm2","Mcm2","Chaf1b","Gmnn","Cdc45","Slbp","Ubr7","Cdc6",
"Rad51ap1","Rpa2","Hells","Fen1","Gins2","Uhrf1","Mcm6","Ung","Dscc1","Usp1","Clspn",
"Cdca7","Pola1","Nasp","Dtl","Mcm5","Wdr76","Prim1","Casp8ap2","Tipin","Blm","Rrm1",
"Brip1","Rad51","Tyms","E2f8","Rfc2","Ccne2","Pcna")

1 Load data

Raw sequences were processed in CellRanger v 6.0.2. Here, filtered h5-files as generated by CellRanger are combined into a Seurat object.

if(!file.exists("analysis/original_data.rds")){
  DMSO = Seurat::Read10X_h5(filename = "raw_data/G_DMSO/filtered_feature_bc_matrix.h5", use.names = T)
  GW = Seurat::Read10X_h5(filename = "raw_data/G_GW/filtered_feature_bc_matrix.h5", use.names = T)
  
  
  sdata.DMSO <- CreateSeuratObject(DMSO, project = "DMSO")
  sdata.GW <- CreateSeuratObject(GW, project = "GW")
  
  
  alldata <- merge(sdata.DMSO, sdata.GW, add.cell.ids = c("DMSO","GW"))
  
  rm(DMSO, GW, sdata.GW, sdata.DMSO)
  gc(verbose= FALSE)
  
  saveRDS(alldata, "analysis/original_data.rds")
}else{
  alldata = readRDS("analysis/original_data.rds")
}

Cell counts:

Var1 Freq
DMSO 14950
GW 13487

2 Quality control and filtering

2.1 Plot QC vars

Number of features and counts and percentage of mitochondrial and ribosomal RNA are plotted below. Red lines mark limits for filtering.

# Calculate % of mitochondrial and ribosomal genes per cell
alldata <- PercentageFeatureSet(alldata, "^mt-", col.name = "percent_mito")
alldata <- PercentageFeatureSet(alldata, "^Rp[sl]", col.name = "percent_ribo")

# Visualize QC variables with cutoffs
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
glist = VlnPlot(alldata, group.by = "orig.ident", features = feats, pt.size = 0.1, combine = FALSE)
names(glist) = feats

for(i in feats){
  glist[[i]] = glist[[i]] +
    theme(legend.position = "none")
}
cowplot::plot_grid(align = "v",plotlist = list(glist[["nFeature_RNA"]] + geom_hline(yintercept = 500, color = "red"),
glist[["nCount_RNA"]],
glist[["percent_mito"]] + geom_hline(yintercept = 10, color = "red") ,
glist[["percent_ribo"]] + geom_hline(yintercept = 5, color = "red")))

2.2 Filter cells and genes

The same QC as above are plotted below, only including cells and genes that pass filtering limits (cell limits: > 500 features & > 5% ribosomal RNA & < 10% mitochondrial RNA; gene limit: > 3 copies expressed). The top expressed genes are also shown. Mitochondrial genes are filtered out of the data.

# subset the object to only keep cells with > 500 features and genes expressed in > 3 copies
selected_c <- WhichCells(alldata, expression = nFeature_RNA > 500)
selected_f <- rownames(alldata)[Matrix::rowSums(alldata) > 3]

data.filt <- subset(alldata, features = selected_f, cells = selected_c)

selected_mito <- WhichCells(data.filt, expression = percent_mito < 10)
selected_ribo <- WhichCells(data.filt, expression = percent_ribo > 5)

# subset the object to only keep cells with > 5% ribosomal RNA and <10% mitochondrial RNA
data.filt <- subset(data.filt, cells = selected_mito)
data.filt <- subset(data.filt, cells = selected_ribo)



# Visualize QC variables in filtered data

feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")

VlnPlot(data.filt, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 2) + 
    NoLegend()

# Identify most highly expressed genes (% of counts)
par(mar = c(4, 8, 2, 1))
C <- data.filt@assays$RNA@counts
C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell",
    col = (scales::hue_pal())(20)[20:1], horizontal = TRUE)

# Filter Mitocondrial genes
data.filt <- data.filt[!grepl("^mt-", rownames(data.filt)), ]

2.3 Analyze cell cycle

The S-phase and G2M-phase scores are calculated based on the cell cycle gene lists defined in the Seurat package (translated to mouse genes using biomaRt).

# Before running CellCycleScoring the data need to be normalized and
# logtransformed.

data.filt = NormalizeData(data.filt)

data.filt <- CellCycleScoring(object = data.filt, g2m.features = g2mFeat, 
    s.features = sFeat)

VlnPlot(data.filt, features = c("S.Score", "G2M.Score"), group.by = "orig.ident", 
    ncol = 3, pt.size = 0.1)

2.4 Find Doublets

Doublets are identified and excluded using the DoubletFinder package.

data.filt = FindVariableFeatures(data.filt, verbose = F)
data.filt = ScaleData(data.filt, vars.to.regress = c("nFeature_RNA", "percent_mito"), 
    verbose = F)
data.filt = RunPCA(data.filt, verbose = F, npcs = 20)
data.filt = RunUMAP(data.filt, dims = 1:10, verbose = F)


nExp <- round(ncol(data.filt) * 0.08)  # expect 8% doublets based on 10,000 cells
data.filt <- doubletFinder_v3(data.filt, pN = 0.26, pK = 0.20, nExp = nExp, PCs = 1:10)

DF.name = colnames(data.filt@meta.data)[grepl("DF.classification", colnames(data.filt@meta.data))]

cowplot::plot_grid(ncol = 2, DimPlot(data.filt, group.by = "orig.ident") + NoAxes(), 
    DimPlot(data.filt, group.by = DF.name) + NoAxes())

VlnPlot(data.filt, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)

data.filt = data.filt[, data.filt@meta.data[, DF.name] == "Singlet"]
## [1] "Creating 9239 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

Final cell counts:

Var1 Freq
DMSO 12526
GW 11665

2.5 Save data

The filtered data is saved under analysis/filtered_data.rds

saveRDS(data.filt, file = "analysis/filtered_data.rds")
rm(list=ls())

3 Dimensionality reduction

3.1 Find variable genes

The most highly variable genes in the dataset are plotted below. The top 2000 genes are used for the following dimensionality reduction strategies.

alldata = readRDS("analysis/filtered_data.rds")

suppressWarnings(suppressMessages(alldata <- FindVariableFeatures(alldata, selection.method = "vst", 
    nfeatures = 2000, verbose = FALSE, assay = "RNA")))
top20 <- head(VariableFeatures(alldata), 20)

LabelPoints(plot = VariableFeaturePlot(alldata), points = top20, repel = TRUE)

# Z-score transformation

alldata <- ScaleData(alldata, vars.to.regress = c("percent_mito", "nFeature_RNA"), 
    assay = "RNA")

3.2 PCA

PCA is performed for 50 PCs.

alldata <- RunPCA(alldata, npcs = 50, verbose = F)


plot_grid(ncol = 3, DimPlot(alldata, reduction = "pca", group.by = "orig.ident", dims = 1:2) + NoLegend(),
    DimPlot(alldata, reduction = "pca", group.by = "orig.ident", dims = 3:4) + NoLegend(), 
    DimPlot(alldata, reduction = "pca", group.by = "orig.ident", dims = 5:6)  + 
            theme(legend.position = "bottom"))

VizDimLoadings(alldata, dims = 1:5, reduction = "pca", ncol = 5, balanced = T)

ElbowPlot(alldata, reduction = "pca", ndims = 50)

3.3 t-SNE & UMAP

t-SNE and UMAP are performed on the PCA reduction.

alldata <- RunTSNE(alldata, reduction = "pca", dims = 1:30, perplexity = 30, max_iter = 1000, 
    theta = 0.5, eta = 200, num_threads = 0)

alldata <- RunUMAP(alldata, reduction = "pca", dims = 1:30, n.components = 2, n.neighbors = 30, 
    n.epochs = 200, min.dist = 0.3, learning.rate = 1, spread = 1)


plot_grid(ncol = 3, DimPlot(alldata, reduction = "pca", group.by = "orig.ident") + 
            theme(legend.position = "bottom"), 
    DimPlot(alldata, reduction = "tsne", group.by = "orig.ident") + NoLegend(),
    DimPlot(alldata, reduction = "umap", group.by = "orig.ident") + NoLegend())

3.4 Save data

The Seurat object with PCA, t-SNE and UMAP reductions is saved under analysis/filtered_data_dr.rds

saveRDS(alldata, file = "analysis/filtered_data_dr.rds")
rm(list=ls())

4 Integration

4.1 Variable features by sample

Top 2000 variable features are identified for each sample separately. Overlap in top variable features between samples is shown as a Venn diagram.

alldata = readRDS("analysis/filtered_data_dr.rds")

alldata.list <- SplitObject(alldata, split.by = "orig.ident")

for (i in 1:length(alldata.list)) {
    alldata.list[[i]] <- NormalizeData(alldata.list[[i]], verbose = FALSE)
    alldata.list[[i]] <- FindVariableFeatures(alldata.list[[i]], selection.method = "vst", 
        nfeatures = 2000, verbose = FALSE)
}

hvgs_per_dataset <- lapply(alldata.list, function(x) {
    x@assays$RNA@var.features
})
venn::venn(hvgs_per_dataset, opacity = 0.4, zcolor = (scales::hue_pal())(3), cexsn = 1, 
    cexil = 1, lwd = 1, col = "white", frame = F, borders = NA)

4.2 Integrate data

Samples are integrated using Seurat integration functionality with CCA.

alldata.anchors <- FindIntegrationAnchors(object.list = alldata.list, dims = 1:30, 
    reduction = "cca")


alldata.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA")

rm(alldata.list, alldata.anchors)
gc(verbose = FALSE)

# Run Dimensionality reduction on integrated space
alldata.int <- ScaleData(alldata.int, verbose = FALSE)
alldata.int <- RunPCA(alldata.int, npcs = 30, verbose = FALSE)
alldata.int <- RunUMAP(alldata.int, dims = 1:30)
alldata.int <- RunTSNE(alldata.int, dims = 1:30)

plot_grid(ncol = 3,
  DimPlot(alldata, reduction = "pca", group.by = "orig.ident") + NoAxes() + ggtitle("PCA raw_data"),
  DimPlot(alldata, reduction = "tsne", group.by = "orig.ident") + NoAxes() + ggtitle("tSNE raw_data"),
  DimPlot(alldata, reduction = "umap", group.by = "orig.ident") + NoAxes() + ggtitle("UMAP raw_data"),
  
  DimPlot(alldata.int, reduction = "pca", group.by = "orig.ident") + NoAxes() + ggtitle("PCA integrated"),
  DimPlot(alldata.int, reduction = "tsne", group.by = "orig.ident") + NoAxes() + ggtitle("tSNE integrated"),
  DimPlot(alldata.int, reduction = "umap", group.by = "orig.ident") + NoAxes() + ggtitle("UMAP integrated")
)

##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells   3170286  169.4    5324025   284.4    5324025   284.4
## Vcells 487591494 3720.1 1529514480 11669.3 3200132041 24415.1

4.3 QC in integrated dataset

QC variables are visualized in UMAP of the integrated dataset.

FeaturePlot(alldata.int, reduction = "umap", 
            features = c("nFeature_RNA", "percent_mito","percent_ribo", "S.Score", "G2M.Score"), 
    order = T, slot = "data", combine = T)

4.4 Save data

The Seurat object with CCA integration is saved under analysis/filtered_data_int.rds

saveRDS(alldata.int, file = "analysis/filtered_data_int.rds")

rm(list=ls())

5 Clustering

alldata <- readRDS("analysis/filtered_data_int.rds")

5.1 Graph clustering

Clustering is performed using FindNeighbors on CCA assay and FindClusters using Louvain algorithm at several resolutions. Results are shown for resolutions 0.5, 1, 1.5 and 2. Resolution 0.5 is selected for further analysis.

DefaultAssay(alldata) = "CCA"

alldata <- FindNeighbors(alldata, dims = 1:30, k.param = 200, prune.SNN = 1/15)
# Clustering with louvain (algorithm 1)
for (res in c(0.1, 0.25, 0.5, 1, 1.5, 2)) {
    alldata <- FindClusters(alldata, graph.name = "CCA_snn", resolution = res, algorithm = 1)
}

# each time you run clustering, the data is stored in meta data columns:
# seurat_clusters - lastest results only CCA_snn_res.XX - for each different
# resolution you test.
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24191
## Number of edges: 8932737
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9451
## Number of communities: 5
## Elapsed time: 44 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24191
## Number of edges: 8932737
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9036
## Number of communities: 6
## Elapsed time: 37 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24191
## Number of edges: 8932737
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8588
## Number of communities: 9
## Elapsed time: 44 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24191
## Number of edges: 8932737
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7951
## Number of communities: 11
## Elapsed time: 35 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24191
## Number of edges: 8932737
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7428
## Number of communities: 11
## Elapsed time: 33 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 24191
## Number of edges: 8932737
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6946
## Number of communities: 17
## Elapsed time: 28 seconds
plot_grid(ncol = 2, DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.0.5", label = TRUE) + 
            ggtitle("louvain_0.5") + NoLegend(), 
    DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.1", label = TRUE) + 
      ggtitle("louvain_1") + NoLegend(), 
    DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.1.5", label = TRUE) +
      ggtitle("louvain_1.5") + NoLegend(),
    DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.2", label = TRUE) +
      ggtitle("louvain_2") + NoLegend())

clustree(alldata@meta.data, prefix = "CCA_snn_res.")

5.2 QC, CCA louvain 0.5

QC variables are visualized for each cluster as established with the Louvain algorithm at resolution 0.5.

# Set the identity as louvain with resolution 0.5
sel.clust = "CCA_snn_res.0.5"

alldata <- SetIdent(alldata, value = sel.clust)


VlnPlot(alldata, features = c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", 
                              "S.Score", "G2M.Score"), group.by = "CCA_snn_res.0.5" , 
    ncol = 3, pt.size = 0.1)

Number of cells per cluster:

DMSO GW
0 2543 2056
1 1978 1998
2 2033 1821
3 1848 1741
4 1345 1238
5 1158 1299
6 970 926
7 481 420
8 170 166

5.3 Comparison between samples

alldata$ident_clust = paste0(alldata@meta.data[, sel.clust],"_",alldata$orig.ident)

alldata <- ScaleData(alldata, assay = "RNA")

ggplotColours <- function(n = 6, h = c(0, 360) + 15){
  if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
  hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}

color_list <- ggplotColours(n=length(unique(alldata@meta.data[, sel.clust])))



plot_grid(ncol = 3, DimPlot(alldata, label = T) + NoAxes() + theme(legend.position = "none"),
        DimPlot(alldata, cells = alldata$orig.ident == "DMSO") + NoAxes() + 
          labs(title="DMSO") + theme(legend.position = "none", plot.title = element_text(hjust = 0.5)), 
        DimPlot(alldata, cells = alldata$orig.ident == "GW") + NoAxes() + 
          labs(title="GW") + theme(legend.position = "none", plot.title = element_text(hjust = 0.5)))

frequencies = as.data.frame(table(data.frame(cluster=alldata@meta.data[, sel.clust],
                 sample = alldata$orig.ident)))
frequencies$Proportion=NA
for(i in 1:nrow(frequencies)){
  frequencies$Proportion[i] = frequencies$Freq[i]/sum(frequencies$Freq[frequencies$sample==frequencies$sample[i]])
}


ggplot(frequencies, aes(fill=sample, y = Proportion, x = cluster)) +
  geom_bar(stat = "identity",position="dodge", size = 0.3, color = "black") +
    theme(panel.background = element_blank(),axis.ticks.y = element_blank(), 
          axis.text = element_text(size = 12)) +
      scale_fill_manual(values = c("#dfdfde", "#bdc9e2"))

5.4 Save data

The Seurat object with clustering information is saved under analysis/filtered_data_clus.rds

saveRDS(alldata, file = "analysis/filtered_data_clus.rds")

6 Differential expression

6.1 Cluster marker genes

Markers for each cluster are identified with thresholds logFC > 0.2, p < 0.01, difference in % > 0.2. Top 25 (by p-value) genes are shown in bar graphs and heatmap.

markers_genes <- FindAllMarkers(alldata, logfc.threshold = 0.2, test.use = "wilcox", 
    min.pct = 0.1, min.diff.pct = 0.2, only.pos = TRUE, max.cells.per.ident = 50, 
    assay = "RNA", random.seed = 42)


write.csv2(markers_genes, "analysis/diffexpr_clusters_ribo5.csv")

top25 <- markers_genes %>% group_by(cluster) %>% top_n(-25, p_val)
mypar(2, 5, mar = c(4, 6, 3, 1))
for (i in unique(top25$cluster)) {
    barplot(sort(setNames(top25$avg_logFC, top25$gene)[top25$cluster == i], F), horiz = T, 
        las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i", xlab="logFC")
    abline(v = c(0, 0.2), lty = c(1, 2))
}

DoHeatmap(alldata, features =  as.character(unique(top25$gene)), assay ="RNA",label = FALSE) + labs(color = "Cluster")

Comparisons between clusters are shown for the top 5 genes (by p-value) of each cluster.

top5 <- markers_genes %>% group_by(cluster) %>% top_n(-5, p_val_adj)

DotPlot(alldata, features = rev(as.character(unique(top5$gene))), group.by = sel.clust, 
    assay = "RNA") + coord_flip()

Top 3 gens (by p-value) for each cluster are visualized with violin plots.

top3 <- top5 %>% group_by(cluster) %>% top_n(-3, p_val)


# set pt.size to zero if you do not want all the points to hide the violin
# shapes, or to a small value like 0.1
VlnPlot(alldata, features = as.character(unique(top3$gene)), ncol = 5, group.by = sel.clust, 
    assay = "RNA", pt.size = -1)

7 Differential expression between samples

DMSO and GW3965 conditions are compared for each cluster separately. DE genes are identified with thresholds logFC > 0.2, p < 0.01. Top 25 genes (by p-value) are shown in bar graphs. Top 5 genes (by p-value) are shown in violin plots. All DE genes across clusters are shown as a dot plot and a heatmap. A list of all genes (no thresholds) is also generated and saved to the analysis folder for analysis of specific genes.

DGE_list = list()

for(i in 0:max(as.numeric(as.character(alldata@meta.data[, sel.clust])))){
  # select all cells in cluster i
  cell_selection <- subset(alldata, cells = colnames(alldata)[alldata@meta.data[, sel.clust] == 
      i])
  cell_selection <- SetIdent(cell_selection, value = "orig.ident")
  # Compute differentiall expression
  DGE_cell_selection <- FindAllMarkers(cell_selection, logfc.threshold = 0.2, test.use = "wilcox", 
      min.pct = 0.1, only.pos = TRUE, max.cells.per.ident = 150,
      assay = "RNA", random.seed = 42)
  DGE_list[[i+1]] = DGE_cell_selection
}

write.csv2(do.call("rbind", lapply(1:length(DGE_list), function(x){xdf = DGE_list[[x]]; xdf$cluster_no = x-1; xdf})), "analysis/diffexpr_GWvsDMSO_clusters.csv")
for(i in 1:length(unique(alldata@meta.data[, sel.clust]))){
  mypar(1, 2, mar = c(4, 6, 3, 1))
  thiscluster = DGE_list[[i]]
  
  top25_cell_selection <- DGE_list[[i]] %>% group_by(cluster) %>% top_n(-25, p_val)
  
  for (j in c("DMSO","GW")) {
    if(sum(top25_cell_selection$cluster == j)>0){
      barplot(sort(setNames(top25_cell_selection$avg_logFC, top25_cell_selection$gene)[top25_cell_selection$cluster == j], F), horiz = T, 
          las = 1, main = paste("Cluster",i-1,"Up in",j), border = "white", yaxs = "i", xlab = "logFC")
      abline(v = c(0, 0.20), lty = c(1, 2))
    }
  }

  
  top5_cell_selection <- DGE_list[[i]] %>% group_by(cluster) %>% top_n(-5, p_val)

  plotlist = VlnPlot(cell_selection, features = as.character(unique(top5_cell_selection$gene)), 
      ncol = 5, group.by = "orig.ident", assay = "RNA", pt.size = 0.1, combine = FALSE)
  
  plotlist = lapply(1:length(plotlist), function(i){plotlist[[i]] + 
      labs(title = paste(top5_cell_selection$gene[[i]])) +
      theme(legend.position = "none",
            plot.title = element_text(size = 12),
            axis.title = element_text(size = 10),
            axis.text = element_text(size = 10))})
  
  cowplotlist = list(ggdraw() + 
                      draw_label(
                        paste("Cluster",i-1),
                        fontface = 'bold',
                        x = 0,
                        hjust = 0,
                        size = 16
                      ) +
                      theme(
                        # add margin on the left of the drawing canvas,
                        # so title is aligned with left edge of first plot
                        plot.margin = margin(0, 0, 0, 7)
                      ))
  if(sum(top5_cell_selection$cluster=="DMSO")>0){
    cowplotlist[[length(cowplotlist)+1]] = ggdraw() + 
    draw_label(
      "Up in DMSO",
      fontface = 'bold',
      x = 0,
      hjust = 0
    ) +
    theme(
      # add margin on the left of the drawing canvas,
      # so title is aligned with left edge of first plot
      plot.margin = margin(0, 0, 0, 7)
    )
    cowplotlist[[length(cowplotlist)+1]] = cowplot::plot_grid(plotlist = plotlist[top5_cell_selection$cluster=="DMSO"], ncol = 5)
  }
  if(sum(top5_cell_selection$cluster=="GW")>0){
    cowplotlist[[length(cowplotlist)+1]] = ggdraw() + 
    draw_label(
      "Up in GW",
      fontface = 'bold',
      x = 0,
      hjust = 0
    ) +
    theme(
      # add margin on the left of the drawing canvas,
      # so title is aligned with left edge of first plot
      plot.margin = margin(0, 0, 0, 7)
    )
    cowplotlist[[length(cowplotlist)+1]] = cowplot::plot_grid(plotlist = plotlist[top5_cell_selection$cluster=="GW"], ncol = 5)
  }
  print(plot_grid(plotlist = cowplotlist,
    ncol = 1,
    # rel_heights values control vertical title margins
    rel_heights = c(0.15, 0.1, 1, 0.1, 1)[1:length(cowplotlist)]
  ))
}

all_cluster_DGE = do.call("rbind", lapply(1:length(DGE_list), function(x){xdf = DGE_list[[x]]; xdf$cluster_no = x-1; xdf}))
all_cluster_DGE$gene = factor(all_cluster_DGE$gene, levels = unique(sort(all_cluster_DGE$gene)))
  
  
cowplot::plot_grid(ggplot(all_cluster_DGE[all_cluster_DGE$cluster=="DMSO",], aes(y =  cluster_no, x = factor(gene, levels = sort(unique(gene), decreasing = TRUE)))) + geom_point(aes(size = -log10(p_val), color = avg_logFC)) +
        theme(panel.background = element_blank(), panel.grid.minor = element_line(color="grey60"),
              axis.text = element_text(size = 10),
        axis.ticks.x = element_blank(), legend.key = element_blank(),
        legend.position = "bottom",  
        legend.box = "vertical") + 
        scale_y_continuous(breaks = min(all_cluster_DGE$cluster_no):max(all_cluster_DGE$cluster_no)) + 
          scale_color_gradientn(colors = c("#CCCCFF", "#0000AA"), limits = c(0,max(all_cluster_DGE$avg_logFC))) +
          coord_flip(clip ="off") +
        labs(x = "Gene", y = "Cluster", title = "Up in DMSO") +
          scale_size(limits = c(0,max(-log10(all_cluster_DGE$p_val)))),
ggplot(all_cluster_DGE[all_cluster_DGE$cluster=="GW",], aes(y =  cluster_no, 
                                                            x  = factor(gene, levels = sort(unique(gene),
                                                                                            decreasing = TRUE)))) +
  geom_point(aes(size = -log10(p_val), color = avg_logFC)) +
        theme(panel.background = element_blank(), panel.grid.minor = element_line(color="grey60"),
              axis.text = element_text(size = 10),
        axis.ticks.x = element_blank(), legend.key = element_blank(),
        legend.position = "bottom",  
        legend.box = "vertical") + 
        scale_y_continuous(breaks = min(all_cluster_DGE$cluster_no):max(all_cluster_DGE$cluster_no)) + 
  coord_flip(clip ="off") +
        labs(x = "Gene", y = "Cluster", title = "Up in GW3965") +
          scale_size(limits = c(0,max(-log10(all_cluster_DGE$p_val)))) +
  scale_color_gradientn(colors = c("#FFCCCC", "#AA0000"), limits = c(0,max(all_cluster_DGE$avg_logFC))),
ncol = 2, align = "h")

hmgenes = unlist(lapply(DGE_list, function(x){x$gene}))
if(sum(apply(table(hmgenes, unlist(lapply(DGE_list, function(x){x$cluster}))), 1, function(x){sum(x==0)})!=1)>0){
  print("Opposite genes!")
}else{
  hmgenes = unique(hmgenes)
  degenes = do.call("rbind", DGE_list)[hmgenes,]
  degenes = degenes[order(degenes$cluster),]
  
  DoHeatmap(alldata, features = degenes$gene, assay = "RNA", label = FALSE, group.by = "ident_clust",
            group.colors = rep(color_list,rep(2,9))) +
    theme(legend.position = "bottom")

}

DGE_list = list()

for(i in 0:(length(unique(alldata@meta.data[, sel.clust]))-1)){
  # select all cells in cluster i
  cell_selection <- subset(alldata, cells = colnames(alldata)[alldata@meta.data[, sel.clust] == 
      i])
  cell_selection <- SetIdent(cell_selection, value = "orig.ident")
  # Compute differentiall expression
  DGE_cell_selection <- FindAllMarkers(cell_selection, logfc.threshold = -Inf, test.use = "wilcox", 
      min.pct = -Inf, only.pos = TRUE, max.cells.per.ident = 150,  return.thresh = 1, 
      assay = "RNA", random.seed = 3415142)
  DGE_list[[i+1]] = DGE_cell_selection
}


write.csv2(do.call("rbind", lapply(1:length(DGE_list), function(x){xdf = DGE_list[[x]]; xdf$cluster_no = x-1; xdf})),  "analysis/diffexpr_GWvsDMSO_clusters_nothreshold.csv")

8 Genes of interest

getsignif = function(ps){
  sigs = rep("",length(ps))
  sigs[ps<0.05] = "*"
  sigs[ps<0.01] = "**"
  sigs[ps<0.001] = "***"
  sigs[ps<0.0001] = "****"
  sigs[ps<0.00001] = "*****"
  sigs[ps<0.000001] = "******"
  return(sigs)
}


egfgenes = c("Areg","Ereg","Tgfa","Egf","Egfr")


wntgenes = c("Ascl2", "Axin2", "Wnt3", "Wnt9b")

notchgenes = c("Dll1", "Dll4", "Hes1", "Jag1", "Jag2", "Notch1", "Notch2", "Notch3")
alldata@active.assay="RNA"

allLogFCs = read.csv2("analysis/diffexpr_GWvsDMSO_clusters_nothreshold.csv")

allLogFCs$logFC = allLogFCs$avg_logFC*c(-1,1)[(allLogFCs$cluster=="GW")+1]
allLogFCs$Cluster = factor(allLogFCs$cluster_no)
allLogFCs$sig = getsignif(allLogFCs$p_val)

9 Growth pathways logFC plots

egfplot = (ggplot(allLogFCs[allLogFCs$gene %in% egfgenes,], aes(fill = Cluster, y = logFC, x = gene)) + 
             geom_bar(stat = "identity", position = "dodge", width = 0.75) + 
    geom_text(aes(label = sig, #label=signif(p_val, 2),
                  y = pmax(logFC, 0)), vjust = -1, position = position_dodge(width = 0.75)) + 
    theme(panel.background = element_blank(), axis.ticks.x = element_blank()))



wntplot = (ggplot(allLogFCs[allLogFCs$gene %in% wntgenes,], aes(fill = Cluster, y = logFC, x = gene)) + 
             geom_bar(stat = "identity", position = "dodge", width = 0.75) + 
    geom_text(aes(label = sig, #label=signif(p_val, 2),
                  y = pmax(logFC, 0)), vjust = -1, position = position_dodge(width = 0.75)) + 
    theme(panel.background = element_blank(), axis.ticks.x = element_blank()))


notchplot = (ggplot(allLogFCs[allLogFCs$gene %in% notchgenes,], 
                    aes(fill = Cluster, y = logFC, x = gene)) + 
               geom_bar(stat = "identity", position = "dodge", width = 0.75) + 
    geom_text(aes(label = sig, #label=signif(p_val, 2),
                  y = pmax(logFC, 0)), vjust = -1, position = position_dodge(width = 0.75)) + 
    theme(panel.background = element_blank(), axis.ticks.x = element_blank()))


mymax = max(egfplot$data[,"logFC"],wntplot$data[,"logFC"],notchplot$data[,"logFC"])
mymin = min(egfplot$data[,"logFC"],wntplot$data[,"logFC"],notchplot$data[,"logFC"])




cowplot::plot_grid(
                   wntplot + 
                     scale_y_continuous(limits = c(mymin,mymax*1.1)) + 
                     theme(axis.text.x = element_text(size =12), axis.title.x = element_blank(), legend.position = "none"),
                   notchplot + 
                     scale_y_continuous(limits = c(mymin,mymax*1.1)) +
                     theme(axis.text.x = element_text(size =12), axis.title.x = element_blank(), legend.position = "none"),
                   egfplot + 
                     scale_y_continuous(limits = c(mymin,mymax*1.1)) + 
                     theme(axis.text.x = element_text(size =12), axis.title.x = element_blank()),
                   ncol = 3)

pwLogFCs = allLogFCs[allLogFCs$gene %in% c(egfgenes,wntgenes,notchgenes),]
pwLogFCs$gene = factor(pwLogFCs$gene, levels = c(egfgenes,wntgenes,notchgenes))
ggplot(pwLogFCs, aes(x = gene, y = logFC, fill = Cluster)) + 
  geom_bar(stat ="identity", position="dodge") + 
  geom_text(aes(label = sig, y = pmax(logFC, 0)), vjust = 0, position = position_dodge(width = 0.9)) +
  theme(panel.background = element_blank())

10 Areg and Abca1 expression barplots

Abca1df = cbind(alldata@assays$RNA@data["Abca1",], alldata@meta.data) 
colnames(Abca1df)[1] = "Abca1"
class(Abca1df)


plot_grid(
  ggplot(allLogFCs[allLogFCs$gene=="Areg",], aes(x = factor(Cluster,levels=100:0), y = logFC, fill = Cluster)) + geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) + 
    labs(x = "Cluster", y = "logFC (GW3965/DMSO)", title = "Areg") + 
    geom_text(aes(label=sig, y = pmax(logFC, 0)), vjust = 0, hjust = 2) + 
      theme(panel.background = element_blank(), plot.title = element_text(hjust = 0.5), axis.ticks.x = element_blank()) + 
    coord_flip(),
  
  ggplot(allLogFCs[allLogFCs$gene=="Abca1",], aes(x = factor(Cluster,levels=100:0), y = logFC, fill = Cluster)) + 
  geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
  labs(x = "Cluster", y = "logFC (GW3965/DMSO)", title = "Abca1") + 
    geom_text(aes(label=sig, y = pmax(logFC, 0)), vjust = 0, hjust = 2) + 
      theme(panel.background = element_blank(), plot.title = element_text(hjust = 0.5), axis.ticks.x = element_blank()) + 
    coord_flip(),
  
  ggplot(Abca1df  , aes(x = Abca1, y = factor(CCA_snn_res.0.5,levels = 100:0), fill = factor(orig.ident, levels = c("GW","DMSO")))) + 
    stat_summary(geom="bar", position = position_dodge(width = 0.9) ) +
    stat_summary(geom = "errorbar", position = position_dodge(width = 0.9), width = 0.4) +
    theme(panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) + 
    labs(y = "Cluster", fill = "", x = "Expression Level", title = "Abca1") +
    scale_fill_manual(values = c(DMSO = "#dfdfde", GW = "#bdc9e2")),

  

ncol = 3)

## [1] "data.frame"
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS/LAPACK: /Users/jenfra/miniconda3/envs/Das_RNASeq/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] KernSmooth_2.23-20  fields_12.3         viridis_0.6.1      
##  [4] viridisLite_0.4.0   spam_2.6-0          dotCall64_1.0-1    
##  [7] DoubletFinder_2.0.3 dplyr_1.0.6         venn_1.10          
## [10] clustree_0.4.3      ggraph_2.0.5        rafalib_1.0.0      
## [13] pheatmap_1.0.12     cowplot_1.1.1       ggplot2_3.3.3      
## [16] Matrix_1.3-2        Seurat_3.1.2       
## 
## loaded via a namespace (and not attached):
##   [1] TH.data_1.0-10      Rtsne_0.15          colorspace_2.0-1   
##   [4] ellipsis_0.3.2      ggridges_0.5.3      farver_2.1.0       
##   [7] leiden_0.3.8        listenv_0.8.0       graphlayouts_0.7.1 
##  [10] ggrepel_0.9.1       fansi_0.5.0         mvtnorm_1.1-1      
##  [13] mathjaxr_1.4-0      codetools_0.2-18    splines_3.6.3      
##  [16] R.methodsS3_1.8.1   mnormt_2.0.2        knitr_1.33         
##  [19] TFisher_0.2.0       polyclip_1.10-0     jsonlite_1.7.2     
##  [22] ica_1.0-2           cluster_2.1.2       png_0.1-7          
##  [25] R.oo_1.24.0         uwot_0.1.10         ggforce_0.3.3      
##  [28] sctransform_0.3.2   compiler_3.6.3      httr_1.4.2         
##  [31] assertthat_0.2.1    lazyeval_0.2.2      tweenr_1.0.2       
##  [34] admisc_0.15         htmltools_0.5.1.1   tools_3.6.3        
##  [37] rsvd_1.0.3          igraph_1.2.6        gtable_0.3.0       
##  [40] glue_1.4.2          RANN_2.6.1          reshape2_1.4.4     
##  [43] maps_3.3.0          Rcpp_1.0.6          Biobase_2.46.0     
##  [46] jquerylib_0.1.4     vctrs_0.3.8         multtest_2.42.0    
##  [49] ape_5.5             nlme_3.1-152        lmtest_0.9-38      
##  [52] xfun_0.23           stringr_1.4.0       globals_0.14.0     
##  [55] rbibutils_2.1.1     lifecycle_1.0.0     irlba_2.3.3        
##  [58] future_1.21.0       MASS_7.3-54         zoo_1.8-9          
##  [61] scales_1.1.1        tidygraph_1.2.0     parallel_3.6.3     
##  [64] sandwich_3.0-1      RColorBrewer_1.1-2  yaml_2.2.1         
##  [67] reticulate_1.20     pbapply_1.4-3       gridExtra_2.3      
##  [70] sass_0.4.0          stringi_1.6.2       highr_0.9          
##  [73] mutoss_0.1-12       plotrix_3.8-1       BiocGenerics_0.32.0
##  [76] Rdpack_2.1.2        SDMTools_1.1-221.2  rlang_0.4.11       
##  [79] pkgconfig_2.0.3     matrixStats_0.59.0  evaluate_0.14      
##  [82] lattice_0.20-44     ROCR_1.0-11         purrr_0.3.4        
##  [85] htmlwidgets_1.5.3   tidyselect_1.1.1    parallelly_1.25.0  
##  [88] RcppAnnoy_0.0.18    plyr_1.8.6          magrittr_2.0.1     
##  [91] bookdown_0.22       R6_2.5.0            generics_0.1.0     
##  [94] multcomp_1.4-17     DBI_1.1.1           withr_2.4.2        
##  [97] pillar_1.6.1        sn_2.0.0            fitdistrplus_1.1-5 
## [100] survival_3.2-11     tsne_0.1-3          tibble_3.1.2       
## [103] future.apply_1.7.0  crayon_1.4.1        utf8_1.2.1         
## [106] tmvnsim_1.0-2       plotly_4.9.3        rmarkdown_2.8      
## [109] data.table_1.14.0   metap_1.4           digest_0.6.27      
## [112] tidyr_1.1.3         numDeriv_2016.8-1.1 R.utils_2.10.1     
## [115] stats4_3.6.3        munsell_0.5.0       bslib_0.2.5.1